{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "40ce79be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([[0.9, 0.1, 0. , 0. , 0. , 0. ],\n",
       "        [0.5, 0. , 0.5, 0. , 0. , 0. ],\n",
       "        [0. , 0. , 0. , 0.6, 0. , 0.4],\n",
       "        [0. , 0. , 0. , 0. , 0.3, 0.7],\n",
       "        [0. , 0.2, 0.3, 0.5, 0. , 0. ],\n",
       "        [0. , 0. , 0. , 0. , 0. , 1. ]]),\n",
       " array([-1, -2, -2, 10,  1,  0]))"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "#状态转移概率矩阵\n",
    "P = np.array([\n",
    "    [0.9, 0.1, 0.0, 0.0, 0.0, 0.0],\n",
    "    [0.5, 0.0, 0.5, 0.0, 0.0, 0.0],\n",
    "    [0.0, 0.0, 0.0, 0.6, 0.0, 0.4],\n",
    "    [0.0, 0.0, 0.0, 0.0, 0.3, 0.7],\n",
    "    [0.0, 0.2, 0.3, 0.5, 0.0, 0.0],\n",
    "    [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],\n",
    "])\n",
    "\n",
    "#到达每一个状态的奖励\n",
    "R = np.array([-1, -2, -2, 10, 1, 0])\n",
    "\n",
    "P, R"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b68758d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-2.5"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#给定一条序列,计算回报\n",
    "def value_by_chain(chain):\n",
    "    s = 0\n",
    "    for i, c in enumerate(chain):\n",
    "        #给每一步的反馈做一个系数,随着步数往前衰减\n",
    "        s += R[c] * 0.5**i\n",
    "\n",
    "    #最终的反馈是所有步数衰减后的求和\n",
    "    return s\n",
    "\n",
    "\n",
    "value_by_chain(np.array([0, 1, 2, 5]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ada08c2b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-2.01950168e+00, -2.21451846e+00,  1.16142785e+00,  1.05380928e+01,\n",
       "        3.58728554e+00,  6.22301528e-61])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#梯度下降法计算贝尔曼矩阵\n",
    "def get_bellman():\n",
    "    #初始化values\n",
    "    value = np.ones([6])\n",
    "\n",
    "    for _ in range(200):\n",
    "        for i in range(6):\n",
    "            #每一行的概率和它对应的value相乘，乘以gamma，然后和奖励相加\n",
    "            #反复计算，就收敛到了贝尔曼方程矩阵\n",
    "            value[i] = R[i] + 0.5 * P[i].dot(value)\n",
    "\n",
    "    return value\n",
    "\n",
    "\n",
    "get_bellman()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "480f9498",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-2.01950168, -2.21451846,  1.16142785, 10.53809283,  3.58728554,\n",
       "        0.        ])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#解析解贝尔曼方程矩阵\n",
    "def get_bellman():\n",
    "    mat = np.eye(*P.shape)\n",
    "    mat -= 0.5 * P\n",
    "    mat = np.linalg.inv(mat)\n",
    "\n",
    "    return mat.dot(R)\n",
    "\n",
    "\n",
    "get_bellman()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
